/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice, 
 *     this list of conditions and the following disclaimer. 
 *  2. Redistributions in binary form must reproduce the above copyright 
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution. 
 *  3. The name of the author may not be used to endorse or promote products 
 *     derived from this software without specific prior written
 *     permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/*
 * Superblock operations
 *
 * Contents:
 *   TraceBKL
 *   #define NODE_TOO_BIG_FOR_OSNODE
 *   gpfs_s_read_inode2
 *   gpfs_s_read_inode
 *   gpfs_s_delete_inode
 *   gpfs_s_notify_change
 *   gpfs_s_put_super
 *   gpfs_s_statfs
 *   gpfs_s_umount_begin
 *   gpfs_s_remount
 *   gpfs_s_write_inode
 *   gpfs_s_clear_inode
 *   gpfs_s_write_super
 *   gpfs_read_super
 *   gpfs_reg_fs
 *   gpfs_unreg_fs
 *   mmfsd_release
 *   kill_mmfsd
 *   swapd_s_clear_inode
 *   swapd_s_read_inode
 *   cxiRemoveOSNodeUnused
 *   cxiAddOSNodeUnused
 *   get_myinode
 *   exec_mmfs
 *   fork_mount_helper
 *   cxiSetMountInfo
 *   unmountInternal
 *   cxiReactivateOSNode
 *   cxiNewOSNode
 *   cxiFreeOSNode
 *   cxiDeleteMmap
 *   cxiReinitOSNode
 *   cxiFindOSNode
 *   cxiDumpOSNode
 *   cxiRefOSNode
 *   cxiInactiveOSNode
 *   cxiPutOSNode
 *   cxiDestroyOSNode
 *   cxiSetOSNodeType
 *   cxiUpdateInode
 *   cxiCanUncacheOSNode
 *   cxiAddOSNode
 *
 * $Id: super.c,v 1.57.2.3 2002/05/21 21:44:58 dcraft Exp $
 *
 * $Log: super.c,v $
 * Revision 1.57.2.3  2002/05/21 21:44:58  dcraft
 * Pull GPFS 1.2.1 up to kernel 2.4.18.
 * mmfsfuncs.Linux must be distributed with /usr/lpp/mmfs/src
 * on developerworks.
 *
 * Revision 1.57.2.2  2002/05/20 20:06:38  dixonbp
 * Set s_maxbytes (maximum filesize) in the super block.  Later versions of the
 * 2.4.9-xx kernels block NFS file writes that exceed this size.  Previously, the
 * size was hard-coded in nfsd (not obtained from the filesystem).
 *
 * Revision 1.57.2.1  2002/04/17 06:12:07  mcnabb
 * Fix another bug in multi-node delete-on-last-close on Linux:
 * If a file is accessed on Node A, renamed on Node B, and then deleted on
 * Node B, an invalidated dcache entry referring to the old name of the
 * file was left over on Node A, preventing the file from being destroyed.
 *
 * Revision 1.57  2001/10/10 20:24:49  dcraft
 * mount -t gpfs /dev/<dev> /<mnt> caused OOPS.  Found during
 * PD guide verification and Puneet in agreement to fix.
 *
 * Revision 1.56  2001/10/09 17:45:37  dcraft
 * Fixes for running on 2.4.9-ac kernel series. (behind ifdefs)
 *
 * Revision 1.55  2001/09/28 20:46:55  wyllie
 * Include more operations in vfsstats counters
 *
 * Revision 1.54  2001/09/27 12:29:02  gjertsen
 * Do not decrement module use count in umount_begin (forced unmount)
 * because put_super will take care of this later on.
 *
 * Revision 1.53  2001/09/25 18:22:20  gjertsen
 * Remove obsolete IA64 code.
 *
 * Revision 1.52  2001/09/22 20:08:10  dcraft
 * Remove kiobufs from cxiKernelIODescriptor_t.  Use temporary
 * kiobufs for map/unmap.   Remove dead code and dead comments
 * in portability layer and update readmes and license.
 * Fix traceback to appear in mmfs.log file.
 *
 * Revision 1.51  2001/09/20 17:56:38  gjertsen
 * Add logassert for CXINODE_SIZE.
 *
 * Revision 1.50  2001/09/20 05:57:16  wsawdon
 * Renamed va_xperm to va_xinfo and defined bit flags to allow var to be
 * shared between extended permissions and snapLatest entries.
 *
 * Revision 1.49  2001/09/12 05:45:52  schmuck
 * On ACL changes, permission information cached in the Linux inode
 * was not being updated correctly.
 *
 * Revision 1.48  2001/09/12 05:35:54  schmuck
 * Undo last change; broke Linux build:
 * mmfs_mounts_lock/mmfs_mounts_unlock not available.
 *
 * Revision 1.47  2001/09/12 03:10:31  mcnabb
 * Fix for defect 348037: Node panic during force unmount - GPFS in stack
 * Make sure only one thread calls gpfsFinishUnmount.
 *
 * Revision 1.46  2001/08/10 18:22:32  gjertsen
 * Put errno predeclaration for kernel code in Shark-gpl.h.
 *
 * Revision 1.45  2001/08/06 23:38:00  wyllie
 * Add contents section.  Change trace of BKL.
 *
 * Revision 1.44  2001/08/04 00:42:27  tee
 * Remove LINUX_MMAP ifdefs
 *
 * Revision 1.43  2001/07/30 07:13:35  mcnabb
 * Fix handling of vnodes in AIX when soft mounts exist. Unmount was
 * asserting that only one vnode existed, when in fact a gnode could
 * have multiple vnodes that need to be cleaned up independently.
 * Note: mmap usage of GNP_TO_VP is still incorrect, but is being
 * fixed a different way.
 *
 * Revision 1.42  2001/07/20 16:06:30  dcraft
 * defect 344204 (problem in cygnus as well)
 *
 * Revision 1.41  2001/07/19 23:25:05  dcraft
 * Modified linux trace to allow non blocking trace record
 * writes (format is TRACE?N).  New gpfs swapd process created
 * which is responsible for reclaiming inodes (5 percent every
 * time it runs).  Marked all our inodes so that they would be
 * ignored by linux kswapd.  Added "unused" inode to inode
 * cache that could be used as a signal that linux kswapd is
 * running and kick off gpfs swapd.  Added means to ignore attempts
 * to kill mmfsd by kswapd if the system gets low on memory.
 * All done in an attempt to avoid kswapd premature wakeup on simple
 * locks and mutexes.
 *
 * Revision 1.40  2001/06/27 17:36:25  wsawdon
 * Added some trace statements.
 *
 * Revision 1.39  2001/06/06 21:55:35  wyllie
 * Change the way NFSData objects are timed out.  Instead of using watchdog
 * timers that enqueue NFSData objects at interrupt level for later
 * processing by the NFS watchdog kernel process, just let the watchdog
 * process wake up periodically and examine its LRU list of NFSData objects
 * to decide which should be closed.  This fixes a bug on Linux that shows
 * up as the assert "nfsP->nextWatchP == NFS_WATCH_NOT_QUEUED" under high
 * load.  This also allows deleting a bunch of cxiXXX interfaces dealing
 * with watchdog timers.
 *
 * Revision 1.38  2001/06/01 18:51:32  gjertsen
 * Introduce macros for intentional conversion between pointers
 * and 32-bit integers so that needless compiler warnings are
 * filtered out while important warnings are now more apparent.
 *
 * Revision 1.37  2001/05/30 20:41:58  wyllie
 * Purge Linux inode when give up token for a file in the stat cache.
 *
 * Revision 1.36  2001/05/25 14:48:23  gjertsen
 * Minor fixes to get IA64 code to compile again.
 *
 * Revision 1.35  2001/05/18 21:35:13  schmuck
 * Fix error handling in findOrCreate/cxiNewOSNode/read_inode.
 *
 * Revision 1.34  2001/05/12 18:38:27  schmuck
 * Fix problem with NFS returning spurious errors under load.
 *
 * Revision 1.33  2001/05/10 17:07:33  wyllie
 * Trace inode numbers in decimal
 *
 * Revision 1.32  2001/05/09 23:04:32  eshel
 * Use malloc for gpfsNode if it is too big for the linux inode.
 *
 * Revision 1.31  2001/05/08 20:00:19  gjertsen
 * Account for name change with sync daemon in 2.4.2 kernel.
 *
 * Revision 1.30  2001/05/04 23:30:13  schmuck
 * Move "extern struct xxx_operations ..." declarations into .h file
 * instead of replicating them in various .c files.
 * Replace empty gpfs_dops_valid table with a NULL pointer.
 *
 * Revision 1.29  2001/05/04 18:12:59  eshel
 * Storing the gnode inside the linux inode.
 *
 * Revision 1.28  2001/05/03 01:52:52  schmuck
 * Add comment to explain weird looking DBGASSERT.
 *
 * Revision 1.27  2001/05/02 20:41:03  gjertsen
 * Modify use of daemon_fops to reflect "recent" changes in Linux kernel.
 *
 * Revision 1.26  2001/05/02 00:21:24  schmuck
 * Fix another problem found by tortureDir test on Linux:
 * On lookup and create, instantiate the dcache entry while holding the
 * inode lock, or, in case of a negative dcache entry, the directory lock.
 * This closes a window where a token revoke could clear the
 * CO_VFS_REFERENCE flag without invalidating the dcache entry.
 * It also eliminates the need for a d_revalidate operation.
 *
 * Revision 1.25  2001/05/01 20:06:53  wsawdon
 * Fix for Raleigh defect 1947.
 * Linux kernel code must locate gnodes in the same
 * way that it locates its inode structures.
 *
 * Revision 1.24  2001/04/27 19:48:34  wyllie
 * Do console print and traces early in logAssertFailed, so that the point
 * of the failure can more easily be found in the trace.  Add trace to
 * cxiSetMountInfo.
 *
 * Revision 1.23  2001/04/25 20:22:09  eshel
 * Make sure we get the i_rdev set at the end of mknod call.
 *
 * Revision 1.22  2001/04/24 23:11:32  eshel
 * Add the KXIVO_DESTROY flag to tell kxInvalidateOSNode() to set i_nlink in the
 * linux inode to zero on from MNodeToken::token_revoke() with the
 * CTM_A_BEING_DESTROYED on. The file is being destroyed.
 *
 * Revision 1.21  2001/04/24 00:55:10  eshel
 * Set vector table for special files so gpfs will not get these operations.
 *
 * Revision 1.20  2001/04/23 21:09:42  radhak
 * Defect 337635:
 * When deleting an inode, the cxiFreeOSNode() is setting
 * inode->i_mapping->a_ops pointer to NULL causing segmentation in
 * truncate_list_inode_pages() while cleaning mmaped pages.
 *
 * Revision 1.19  2001/04/23 18:11:27  eshel
 * Rename createThreadId to createRaceLoserThreadId and fix logic error.
 *
 * Revision 1.18  2001/04/20 23:03:08  eshel
 * If a regular file is created by the sys_open call (for now we can not tell if
 * the call is from sys_mknod or sys_open) and the file is found, return rc 0,
 * remember the thread that called create. Later on the open call the open flags
 * are available and if it is the same thread, and FEXCL was on fail it with
 * EEXIST, also check permission since linux assumes that this process created
 * the file and did not do any permission check.
 *
 * Revision 1.17  2001/04/18 14:13:20  dixonbp
 * Detect delete_inode calls that result from nfsd calling iput after it
 * invokes iget directly on a stale inode.
 *
 * Revision 1.16  2001/04/17 19:58:42  dcraft
 * Defect 337521.  igrab() failed yet we still attempted an iput().
 *
 * Revision 1.15  2001/04/17 16:41:59  dixonbp
 * Defect 336887: gpfs_read_super was putting the mount options past the end
 * of its buffer.
 *
 * Revision 1.14  2001/04/16 21:08:09  eshel
 * Add option to update file size under a spin_lock and only if it is bigger.
 *
 * Revision 1.13  2001/04/13 00:35:09  dcraft
 * igrab can fail so check return code
 *
 * Revision 1.12  2001/04/11 21:05:00  schmuck
 * Remove gpfs_s_put_inode (this only called gpfsRele, and since put_inode is
 * always called while i_count is still non-zero, gpfsRele doesn't do anything).
 * Define TraceBKL to be a no-op when compiling without VERBOSETRACE.
 *
 * Revision 1.11  2001/04/09 23:27:37  eshel
 * update time changes in OS node (no nanoseconds for now)
 *
 * Revision 1.10  2001/04/09 21:06:01  eshel
 * Add code to keep OS node (linux inode) attributes up to date.
 *
 * Revision 1.9  2001/04/09 20:52:53  dcraft
 * correct the loss of file mode
 *
 * Revision 1.8  2001/04/08 22:18:31  dcraft
 * Fix multinode delete race conditions.  Still incomplete.
 *
 * Revision 1.7  2001/04/06 21:00:16  gjertsen
 * Selectively screen out regular sync calls from kupdate.
 *
 * Revision 1.6  2001/04/04 22:29:05  eshel
 * add cxiUpdateInode() to keep linux inode up to date
 *
 * Revision 1.5  2001/04/04 21:14:45  dcraft
 * Invalidate inode attributes (particularly i_nlink) when getVattr() can no longer
 * find inode.   Update attributes after rename over an existing file, so d_move
 * will correctly kill target dentry.   Add printing of dentries when "mmfsadm dump vnodes"
 * is executed.  Initial implementation of gpfs_d_revalidate.
 *
 * Revision 1.4  2001/04/03 17:21:38  eshel
 * Add delete_inode() to mark inode for destruction when on last linux iput() call.
 *
 * Revision 1.3  2001/03/29 18:26:51  dixonbp
 * Convert super.C to super.c
 *
 * Revision 1.70  2001/03/11 23:15:42  dcraft
 * cxiCanUncacheOSNode must check i_count, otherwise unCache will
 * get in a tight loop over its inability to remove the gpfsNode
 *
 * Revision 1.69  2001/03/05 23:28:13  dcraft
 * Modify inode and gpfsNode reference management.  Inode is now acquired
 * during gpfsNode creation and must be released via cxiPutOSNode().
 * (documented in gpfs database).  Add "mmfsadm dump vnodes" for producing
 * trace info on all held inodes.
 *
 * Revision 1.68  2001/03/05 20:57:16  eshel
 * Move DBGASSERT(holdCount == -1) up.
 *
 * Revision 1.67  2001/03/05 17:40:47  eshel
 * add trace calls
 *
 * Revision 1.66  2001/03/02 22:17:41  eshel
 * The pointer to the linux inode can be null under some error condition, so make
 * sure it is not null before using it and when referencing the hold count in
 * the inode while the pointer is null assume it to be zero.
 *
 * Revision 1.65  2001/03/01 20:44:24  radhak
 * Need serialization between nopage and mmap flush.
 * Also, always get page table lock while holding page lock.
 *
 * Revision 1.64  2001/03/01 19:59:18  dixonbp
 * Handle errors from ReleaseNFS and cxiRefOSNode
 *
 * Revision 1.63  2001/02/27 01:15:00  eshel
 * check if daemon is ready before starting a mount
 *
 * Revision 1.62  2001/02/08 18:03:41  schmuck
 * Tweak fast path through gpfs_i_permission for files that don't have
 * extended acls: instead of doing the check in gpfs_i_permission,
 * set inode_operations::permission to NULL so Linux will do the check
 * without invoking gpfs_i_permission.
 * No functional change.
 *
 * Revision 1.61  2001/01/24 02:08:51  schmuck
 * Ignore calls to gpfs_write_super, since they occur too frequently
 * (we can't afford to do a complete sync every 5 seconds).
 * Instead, rely on the sync watchdog to sync file systems periodically.
 * Unfortunately this means that the "sync" command on Linux is now broken.
 *
 * Revision 1.60  2001/01/12 22:18:05  eshel
 * add test to trace message.
 *
 * Revision 1.59  2000/12/19 21:11:58  wyllie
 * Remove assertions and traces about the state of the Linux BKL.  Linux does
 * not keep track of who owns the lock, so these asserts were blowing up on
 * an SMP if the kernel lock happened to be held by the other processor.
 *
 * Revision 1.58  2000/12/18 13:53:20  gjertsen
 * More cleanup of comments/documentation.
 *
 * Revision 1.57  2000/12/15 13:56:50  gjertsen
 * Clean up documentation.
 *
 * Revision 1.56  2000/12/09 20:33:37  schmuck
 * Instead of setting/checking flags to invalidate/revalidate dcache entries,
 * simply set dentry::d_ops to point to different dentry_operations tables:
 * one where the d_revalidate field is NULL (means the dentry is valid),
 * and one where d_revalidate points at a function that always returns false
 * (means the dentry is invalid).
 *
 * Revision 1.55  2000/12/07 21:38:25  schmuck
 * Add a call to invalidate stat information cached in the Linux inode when
 * the inode token is relinquished/downgraded, so that gpfs_i_revalidate
 * optimization works correctly with multiple nodes.
 * Add similar optimization for gpfs_i_permission.
 * Remove NB_PERF ifdef from this code.
 *
 * Revision 1.54  2000/12/04 18:20:37  eshel
 * Zero privVfsP pointer in the super block only after unmount is complete and
 * remove some check for null pointer.
 *
 * Revision 1.53  2000/12/04 17:49:55  wyllie
 * Do not allow 'read' of a directory, only 'readdir'.  Otherwise the prefetch
 * code could build a buffer descriptor for a hole in a directory, and the read
 * would populate the buffer with binary zeros.  A later fast lookup would find
 * a directory block with an invalid format and barf.
 *
 * Revision 1.52  2000/12/01 02:11:00  schmuck
 * Instead of assigning NULL function pointers when initializing or resetting the
 * gpfs_operations table, have it point to a dummy function that returns ENOSYS.
 * This avoids having to check for NULL before each call.
 *
 * Revision 1.51  2000/11/13 16:45:43  wyllie
 * Comment a naked decimal constant
 *
 * Revision 1.50  2000/11/07 00:16:22  eshel
 * Add code to support remount.
 *
 * Revision 1.49  2000/11/06 19:56:16  gjertsen
 * Linux code cleanup and put in build safeguards.
 *
 * Revision 1.48  2000/11/03 20:27:03  dcraft
 * Build SMP, UP, NOHIGHMEM, and 4G memory variations of mmfslinux in
 * one pass.   Names are mmfslinux_UP1G, mmfslinux_UP4G, mmfslinux_SMP1G,
 * mmfslinux_SMP4G.
 *
 * Revision 1.47  2000/11/02 19:46:37  gjertsen
 * Linux code split. Pull out NBD stuff.
 *
 */

#include <Shark-gpl.h>

#include <linux/string.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/smp_lock.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <asm/uaccess.h>   /* KERNEL_DS */

#define FOOBAR #error Do not do this

/* GPFS headers */
#include <cxiSystem.h>
#include <cxiAtomic.h>
#include <cxi2gpfs.h>
#include <cxiSharedSeg.h>
#include <cxiCred.h>
#include <linux2gpfs.h>
#include <Trace.h>
#include <cxiVFSStats.h>

/* From GPFS FSDisk.h which can't be included.
   Inode number of the root directory; gpfs_s_read_inode2 compares
   iP->i_ino against it to decide whether to mark the inode I_IGNORE. */
#define INODENUM_ROOTDIR_FILE 3

/* Objects defined in other files of the module */
extern struct file_system_type  gpfs_fs_type;
extern struct file_operations   daemon_fops;
extern struct inode_operations  daemon_inodeops;


/* Wait queue private to this file.
   NOTE(review): its users are outside the visible part of the file --
   presumably the mount helper handshake; confirm. */
static DECLARE_WAIT_QUEUE_HEAD(pwq);

/* Generates a file-local kill(pid, sig) system call stub via the
   _syscall2 macro (enabled by __KERNEL_SYSCALLS__ above) */
static inline _syscall2(int,kill,int,pid,int,sig);
int mmfsd_module_active = 0;
/* NOTE(review): presumably the ids of the mmfsd daemon and of the mount
   helper process, -1 meaning "none"; confirm against their users */
static int mmfsd_id = -1;
static int mount_id = -1;
/* Path and command buffers, each sized for CXI_MAXPATHLEN characters plus
   the terminating null.  gpfs_read_super builds mmfs_path as
   bin_path followed by "/mmfsmount". */
char mountCmd[CXI_MAXPATHLEN+1] = "M ";
char mmfs_path[CXI_MAXPATHLEN+1] = "";
char bin_path[CXI_MAXPATHLEN+1];
static char mount_opt[CXI_MAXPATHLEN+1];

/* forward declarations */
/* Called with force == false from gpfs_s_put_super and with
   force == true from gpfs_s_umount_begin */
int unmountInternal(struct super_block *sbP, Boolean force);

/* Routine to trace whether kernel lock is held.
   Emits a single trace record holding the value of kernel_locked() and
   the lock_depth of the current task.  This definition is compiled only
   when VERBOSETRACE is defined. */
#ifdef VERBOSETRACE
void TraceBKL()
{
  TRACE2(TRACE_VNODE, 10, TRCID_VNODE_BKL,
         "BKL %d lock_depth %d\n", kernel_locked(), current->lock_depth);
}
#endif

/* Check whether the full cxiNode_t fits inside a Linux inode in the
   part of the iP->u union after the u.generic_ip pointer field.

   Evaluates to non-zero when nodeSize bytes do NOT fit in that space
   (so separate storage has to be allocated), and to zero when they do.
   The argument is parenthesized in the expansion so that an arbitrary
   expression (e.g. a conditional or bitwise expression) can be passed
   without being re-associated with the '<' comparison. */
#define NODE_TOO_BIG_FOR_OSNODE(nodeSize) \
  ((sizeof(((struct inode *)0)->u) - sizeof(void *)) < (nodeSize))


/* read_inode2 superblock operation, called from iget() just after a new
   inode has been allocated.  It is a variant of the normal read_inode
   operation that lets an opaque parameter be handed through iget4 into
   read_inode2.  The parameter tells us whether we are being called from a
   normal lookup operation, where a distributed lock on the file is already
   held, or from nfs calling iget, where the lock has to be obtained inside
   read_inode2.

   The routine attaches a cxiNode_t to the inode (embedded in iP->u when it
   fits, separately allocated otherwise) and asks gpfsInodeRead to fill in
   the inode attributes.  On any failure the inode is turned into a bad
   inode with a zero link count.

   Note: In the Linux source the call to read_inode2 is labelled a "reiserfs
   specific hack" with the additional warning "We don't want this to last, and
   are looking for VFS changes that will allow us to get rid of it." If and
   when such a change is made, we will hopefully be able to adapt our code
   accordingly.  Otherwise, if read_inode2 goes away without a suitable
   replacement, we will have to use a more expensive approach, e.g., a global
   table where lookup would leave some state before calling iget. */
void
gpfs_s_read_inode2(struct inode *iP, void *opaque)
{
  struct gpfsVfsData_t *privVfsP;
  ino_t inum = iP->i_ino;
  cxiNode_t *cnP;
  int rc;

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_READINODE_ENTER,
         "gpfs_s_read_inode2 enter: inode 0x%lX inode %d\n",
         iP, inum);
  /* BKL is sometimes held at entry */

  /* pick the storage for the cxiNode_t */
  if (!NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE))
  {
    /* it fits in the part of the iP->u union that follows the
       u.generic_ip field, so use that space */
    cnP = (cxiNode_t *)(&iP->u.generic_ip + 1);
  }
  else
  {
    /* it does not fit: obtain separate, zeroed storage */
    cnP = (cxiNode_t *)cxiMallocUnpinned(CXINODE_SIZE);
    if (cnP == NULL)
      goto exit_bad;
    memset(cnP, 0, CXINODE_SIZE);
  }

  TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE_1,
         "gpfs_s_read_inode2: iP 0x%lX cnP 0x%lX uSize-void* %d nodeSize %d",
         iP, cnP, sizeof(iP->u) - sizeof(void *), CXINODE_SIZE);

  /* The individual cxiNode_t fields are not zeroed explicitly here: that
     has already been done, either by the memset above or by
     memset(&inode->u, ....) in clean_inode() */

  /* link the cxiNode_t and the struct inode to each other */
  cnP->osNodeP = iP;
  iP->u.generic_ip = cnP;

  /* fetch the inode attributes */
  privVfsP = VP_TO_PVP(iP);
  rc = gpfs_ops.gpfsInodeRead(privVfsP, cnP, inum, opaque);

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_READINODE_EXIT,
         "gpfs_s_read_inode2 exit: inode 0x%lX rc %d",
         iP, rc);

  if (rc != 0)
  {
    /* back out the cxiNode_t set-up before marking the inode bad */
    if (NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE))
      cxiFreeUnpinned(cnP);
    iP->u.generic_ip = NULL;
    goto exit_bad;
  }

  /* Success.  Our inodes are marked in a special way such that kswapd
   * will ignore them.  We set up one unused inode (see cxiAddOSNodeUnused)
   * to be used as a signal when kswapd runs.  This can't be set for
   * the root dir inode or the umount will fail with EBUSY.  It's irrelevant
   * for root because it always has an i_count until umount and is thus
   * never a candidate for kswapd.
   */
#define I_IGNORE 1024
  if (iP->i_ino != INODENUM_ROOTDIR_FILE)
    iP->i_state |= I_IGNORE;

  return;

exit_bad:
  /* make_bad_inode will initialize iP so that all operations return EIO;
     the link count is also zeroed so that the bad inode will be thrown
     out of the cache at the next opportunity */
  make_bad_inode(iP);
  iP->i_nlink = 0;
}

/* Placeholder read_inode operation.
   The following routine should never be called, since we have a read_inode2
   operation.  However, knfsd checks the operation table and refuses to export
   a file system if its read_inode operation ptr is NULL.  Hence, we need to
   have one, even if it never gets called.  Should it be called anyway, it
   traces the event and marks the inode bad. */
void
gpfs_s_read_inode(struct inode *iP)
{
  TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_READINODE_HUH,
         "gpfs_s_read_inode ?: calling make_bad_inode");
  make_bad_inode(iP);
}


/* delete_inode superblock operation.  Called from iput when the i_count
   goes to zero and the link count in the inode is zero, which presumably
   means that the file was deleted.  If the cxiNode_t is flagged
   destroyIfDelInode, GPFS is asked to delete the inode so the disk space
   occupied by the file can be freed.  The Linux inode is always cleared
   before returning. */
void
gpfs_s_delete_inode(struct inode *iP)
{
  Boolean isGPFS = cxiIsGPFSThread();
  cxiNode_t *cnP;

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_DELETE_INODE,
        "gpfs_s_delete_inode enter: inode 0x%lX inode %d gpfsThread %d\n",
         iP, iP->i_ino, isGPFS);
  TraceBKL();

  cnP = VP_TO_CNP(iP);

  /* The cxiNode_t is set up in gpfs_s_read_inode2, so a NULL cnP means
     gpfs_s_read_inode2 failed and already marked this as a bad inode;
     nothing besides clear_inode is needed in that case. */
  if (cnP != NULL && cnP->destroyIfDelInode)
  {
    struct gpfsVfsData_t *privVfsP = VP_TO_PVP(iP);
    ext_cred_t eCred;

    DBGASSERT(privVfsP != NULL);

    /* ?? "eCred is passed all the way to the daemon, and then is ignored
       there," FBS 5/24/01 */
    setCred(&eCred);

    gpfs_ops.gpfsInodeDelete(privVfsP, cnP, isGPFS, &eCred);
  }

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_DELETE_INODE_1,
         "gpfs_s_delete_inode inode 0x%lX cnP 0x%lX\n",
         iP, cnP);

  clear_inode(iP);
}

/* Attribute-change entry point: forwards the request for the dentry's
   inode to gpfs_i_setattr_internal and hands the result back with its
   sign flipped (zero stays zero). */
int 
gpfs_s_notify_change(struct dentry *dentryP, struct iattr *attrP)
{
  int rc;

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_NOTIFY_ENTER,
         "gpfs_s_notify_change enter: inode 0x%lX attr 0x%lX\n",
         dentryP->d_inode, attrP);
  TraceBKL();

  rc = gpfs_i_setattr_internal(dentryP->d_inode, attrP);

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_NOTIFY_EXIT,
         "gpfs_s_notify_change exit: inode 0x%lX rc %d\n",
         dentryP->d_inode, rc);

  /* negating zero yields zero, so one expression covers both outcomes */
  return -rc;
}

/* put_super is called just before the super_block is freed in do_unmount.
   Performs a non-forced unmountInternal on the super block and then drops
   the module use count if the module is still marked in use.

   The result of unmountInternal is captured so that the exit trace
   reports the real outcome; put_super itself has no way to return it. */
void 
gpfs_s_put_super(struct super_block * sbP)
{
  int rc;

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_PUTSUPER_ENTER,
         "gpfs_s_put_super enter: sbP 0x%lX sbP->s_dev 0x%X\n",
         sbP, sbP->s_dev);
  TraceBKL();

  rc = unmountInternal(sbP, false);

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_PUTSUPER_EXIT,
         "gpfs_s_put_super exit: rc %d\n", rc);

  if (MOD_IN_USE)
    MOD_DEC_USE_COUNT;
}

/* statfs superblock operation.  Clears the caller's buffer, obtains the
   file system statistics from gpfsStatfs and copies them into bufP.
   Returns 0 on success; on failure returns the negated gpfsStatfs result
   and leaves bufP zeroed. */
int 
gpfs_s_statfs(struct super_block *sbP, struct statfs *bufP)
{
  int rc;
  int code = 0;
  int len = sizeof(struct statfs);
  struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)sbP->u.generic_sbp;
  cxiStatfs_t gpfsStat;

  VFS_STAT_START(statfsCall);
  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_STATFS_ENTER,
         "gpfs_s_statfs enter: sbP 0x%lX len %d\n", sbP, len);
  memset(bufP, 0, len);
  /* BKL is held at entry */

  rc = gpfs_ops.gpfsStatfs(privVfsP, &gpfsStat);
  if (rc == 0)
  {
    /* copy the GPFS view of the file system into the Linux structure */
    bufP->f_type = GPFS_SUPER_MAGIC;  // ???
    bufP->f_bsize = gpfsStat.f_bsize;
    bufP->f_blocks = gpfsStat.f_blocks;
    bufP->f_bfree = gpfsStat.f_bfree;
    bufP->f_bavail = gpfsStat.f_bavail;
    bufP->f_files = gpfsStat.f_files;
    bufP->f_ffree = gpfsStat.f_ffree;
    bufP->f_namelen = gpfsStat.f_name_max;
    bufP->f_fsid.val[0] = gpfsStat.f_fsid.val[0];
    bufP->f_fsid.val[1] = gpfsStat.f_fsid.val[1];
  }
  else
  {
    /* hand the error back negated; code records the failure point for
       the trace */
    rc = -rc;
    code = 1;
  }

  TRACE6(TRACE_VNODE, 1, TRCID_LINUXOPS_STATFS_EXIT,
         "gpfs_s_statfs exit: f_blocks %d f_bfree %d f_files %d f_free %d "
         "code %d rc %d\n", bufP->f_blocks, bufP->f_bfree, bufP->f_files,
         bufP->f_ffree, code, rc);
  VFS_STAT_STOP;
  return rc;
}

/* umount_begin is called only when the force option is used.
   Performs a forced unmountInternal on the super block.

   The result of unmountInternal is captured so that the exit trace
   reports the real outcome; umount_begin itself has no way to return it. */
void 
gpfs_s_umount_begin(struct super_block * sbP)
{
  int rc;

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_UMOUNT_ENTER,
         "gpfs_s_umount_begin enter: sbP 0x%lX sbP->s_dev 0x%X\n",
         sbP, sbP->s_dev);
  TraceBKL();

  rc = unmountInternal(sbP, true);

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_UMOUNT_EXIT,
         "gpfs_s_umount_begin exit: rc %d\n", rc);

  /* Module count is decremented later on in do_unmount via gpfs_s_put_super */
}

/* remount superblock operation.  Only traces that it was called and
   reports success; sbP, flags and data are not examined, so nothing is
   changed by a remount request here.  Always returns 0. */
int 
gpfs_s_remount(struct super_block *sbP, int *flags, char *data)
{
  TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_REMOUNT,
         "gpfs_s_remount: called\n");
  TraceBKL();
  return 0;
}

/* write_inode superblock operation.  Does nothing apart from tracing
   the call; the inode argument is not used. */
void 
gpfs_s_write_inode(struct inode *inode)
{
  TRACE0(TRACE_VNODE, 1, TRCID_LINUXOPS_WRITEINODE,
         "gpfs_s_write_inode: called\n");
  TraceBKL();
  return;
}


/* clear_inode superblock operation.  This routine is called from iput()
   just before the storage of the Linux inode is freed.  When a cxiNode_t
   is attached to the inode and the private VFS data is available, the
   node is released through gpfsRele and, if it had been allocated
   separately in gpfs_s_read_inode2, its storage is freed. */
void
gpfs_s_clear_inode(struct inode *iP)
{
  int code = 0;
  struct gpfsVfsData_t *privVfsP;
  cxiNode_t *cnP; 

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_CLEARINODE,
         "gpfs_s_clear_inode enter: inode 0x%lX inode %d generic_ip 0x%lX\n",
         iP, iP->i_ino, iP->u.generic_ip);
  TRACE3(TRACE_VNODE, 5, TRCID_LINUXOPS_CLEARINODE_DETAILS, 
         "gpfs_s_clear_inode enter: cnP 0x%lX privVfsP 0x%lX tooBig %d\n",
         VP_TO_CNP(iP), VP_TO_PVP(iP), 
         NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE));

  if (iP->u.generic_ip != NULL)
  {
    cnP = VP_TO_CNP(iP);
    privVfsP = VP_TO_PVP(iP);

    if (privVfsP != NULL)
    {
      DBGASSERT(atomic_read((atomic_t *)&iP->i_count) == 0);
      gpfs_ops.gpfsRele(privVfsP, cnP, (void *)iP, vnOp);

      /* storage obtained separately in gpfs_s_read_inode2 has to be
         given back here */
      if (NODE_TOO_BIG_FOR_OSNODE(CXINODE_SIZE))
        cxiFreeUnpinned(cnP);
    }
    else
    {
      /* no private VFS data: nothing can be released.
         NOTE(review): a separately allocated cxiNode_t is not freed on
         this path -- confirm that it is reclaimed elsewhere. */
      code = 1;
    }
  }

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_CLEARINODE_EXIT,
         "gpfs_s_clear_inode exit: inode 0x%lX generic_ip 0x%lX code %d\n",
         iP, iP->u.generic_ip, code);
}

/* write_super superblock operation.  Syncs the file system through
   gpfsSyncfs.  On kernels from 2.4.9 on, s_dirt is cleared first and the
   sync is always performed; on older kernels the call is ignored when it
   comes from the kupdate thread.  The (negated) gpfsSyncfs result is only
   traced, since write_super returns nothing. */
void 
gpfs_s_write_super(struct super_block * sbP)
{
  int rc = 0;
  int skipSync = 0;
  struct gpfsVfsData_t *privVfsP = (struct gpfsVfsData_t *)sbP->u.generic_sbp;

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_WRITESUPER,
         "gpfs_s_write_super: enter: sbP 0x%lX\n", sbP);

#if LINUX_KERNEL_VERSION >= 2040900
  /* We have to either adhere to the s_dirt semantics or
   * ignore all syncs.  Once a file system's write_super gets
   * called, sync_supers() restarts the super block scan.  If
   * we don't turn off s_dirt then sync_supers() will be caught
   * in a loop.  Alternatively if we only ignored kupdated then
   *
   * 1) a person could write to a file (which turns on s_dirt)
   * 2) kupdated could run (and be ignored) but the s_dirt is turned off
   * 3) the user attempts a sync from the command line sync, but that
   *    does nothing since s_dirt was off
   * 4) the user expected the sync to have done something before he
   *    halts the machine.
   */
  sbP->s_dirt = 0;
#else
  /* By default linux calls this entry point every 5 seconds.  We can
   * either ignore kupdate or the user can set the kupdate interval
   * to a larger value.  Argument 5 of the bdflush file controls
   * the interval.  Here's an example for changing it to 600 seconds
   *
   * cat /proc/sys/vm/bdflush | awk ' { print $1" "$2" "$3" "$4" "60000 } ' > \
   *     /proc/sys/vm/bdflush
   */
  skipSync = cxiIsKupdateThread();
#endif

  if (!skipSync)
  {
    /* BKL is held at entry */
    TRACE0(TRACE_VNODE, 3, TRCID_LINUXOPS_WRITESUPER_3,
           "gpfs_s_write_super: performing sync");

    /* negating zero yields zero, so the result can be negated
       unconditionally */
    rc = -gpfs_ops.gpfsSyncfs(privVfsP);
  }

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_WRITESUPER_5,
         "gpfs_s_write_super exit: sbP 0x%lX rc %d\n", sbP, rc);
}

/* Mount entry point (read_super callback).  Parses the mount option
 * string to find the GPFS device name, starts the user level mount
 * helper, asks the daemon to perform the mount and attaches the returned
 * root inode to a freshly allocated root dentry.
 *
 *   sbP    - superblock being filled in
 *   dataP  - mount option string; either "<devname>:<options>" (old
 *            syntax) or a comma separated list containing "dev=<devname>"
 *   silent - only traced here
 *
 * Returns sbP on success and NULL on failure.  On failure "code"
 * identifies the failing step and is traced together with rc.
 */
struct super_block *
gpfs_read_super(struct super_block *sbP, void *dataP, int silent)
{
  int kernel_unlock = 0;
  struct inode *rootIP = NULL;
  struct dentry *rootDP = NULL;
  char *myBufP = NULL;
  char *sgNameP;
  char *strP;
  char *mountpointP;
  char *optionsP;
  int rc = 0;
  int mountHelperID = -1;
  int code = 0;
  int namelen;
  struct gpfsVfsData_t *privVfsP;
  cxiNode_t *cnRootP;
  cxiIno_t rootINum;

  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_1,
         "gpfs_read_super enter: sbP 0x%lX dev 0x%X silent %d data '%s'\n",
         sbP, sbP->s_dev, silent, ((char *)dataP == NULL) ? "" : dataP);
  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_2,
         "gpfs_read_super: dev name '%s'\n", bdevname(sbP->s_dev));
  /* BKL is held at entry */

  /* Take the module reference before the first possible failure.  The
   * xerror path drops one reference whenever the module is in use, so
   * bailing out ahead of the increment would release a reference that
   * this call never took.
   */
  MOD_INC_USE_COUNT;

  if (dataP == NULL)
  {
    rc = EINVAL;
    code = 1;
    goto xerror;
  }
   
  if (strlen((char *)dataP) > CXI_MAXPATHLEN)
  {
    rc = ENAMETOOLONG;
    code = 2;
    goto xerror;
  }

  sbP->s_magic = GPFS_SUPER_MAGIC;
  sbP->s_op = &gpfs_sops;
  sbP->u.generic_sbp = NULL; /* set up for error */
  sbP->s_root = NULL;
  sbP->s_blocksize = 0;
  sbP->s_blocksize_bits = 0;
  sbP->s_maxbytes = ~0ULL; /* maximum filesize */

  /* Work on a private copy of the option string; it is edited in place */
  myBufP = (char *)cxiMallocPinned(strlen((char *)dataP) + 1);
  if (myBufP == NULL)
  {
    code = 3;
    rc = ENOMEM;
    goto xerror;
  }
  strcpy(myBufP, (char *)dataP);
  optionsP = myBufP;

#define OLD_LINUX_PROTO
#ifdef OLD_LINUX_PROTO
  /* This is the old syntax where the options field contained
   * a first option of the device name followed by a colon,
   * then more options.
   */
  sgNameP = myBufP;
  strP = (char *)strchr(sgNameP, ':'); 
  if (strP != NULL)
  {
    *strP = '\0';
    optionsP = strP + 1;
  }
  else
  {
#endif
  /* This is the syntax parser for the options field.  At 
   * least one option must be "dev=<devname>".
   */
  sgNameP = NULL;
  strP = myBufP;

  while(strP)
  {
    if (!strncmp(strP, "dev=", 4))
    {
      sgNameP = (char *)strchr(strP, '=') + 1;
      strP = (char *)strchr(strP, ','); /* more options */
      if (strP)
        namelen = strP - sgNameP;
      else
        namelen = strlen(sgNameP);

      /* Move the sgName to the front of the buffer, null terminate
       * it, then place whatever options follow "dev=<devname>" right
       * behind it.  Source and destination both live in myBufP and
       * may overlap, hence memmove rather than strncpy/strcpy.
       * NOTE(review): options that precede "dev=" are overwritten by
       * this step and are not passed on -- confirm that is intended.
       */
      memmove(myBufP, sgNameP, namelen);
      sgNameP = myBufP;
      sgNameP[namelen] = '\0';

      optionsP = myBufP + namelen + 1;
      /* Move the options next (if there are any) */
      if (strP)
        memmove(optionsP, strP, strlen(strP) + 1);
      else
        *optionsP = '\0';
      break;
    }
    else
    {
      strP = (char *)strchr(strP, ',');
      if (strP) strP++;
    }
  }
#ifdef OLD_LINUX_PROTO
  }
  if (sgNameP == NULL)
  {
    /* Neither syntax matched: treat the whole string as the name */
    sgNameP = myBufP;
    optionsP = "";
  }
#endif
    
  if (sgNameP == NULL)
  {
    code = 4;
    rc = EINVAL;
    goto xerror;
  }
  mountpointP = sgNameP;  /* ??? */

  strcpy(mmfs_path, bin_path);
  strcat(mmfs_path, "/mmfsmount");

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_STARTHELPER,
         "gpfs_read_super: start mount helper '%s'\n", mmfs_path);

  if (strlen(sgNameP) > CXI_MAXPATHLEN)
  {
    rc = ENAMETOOLONG;
    code = 5;
    goto xerror;
  }
  rc = gpfs_ops.gpfsReady();
  if (rc != 0)
  {
    rc = EAGAIN;
    code = 6;
    goto xerror;
  }

  /* Start a new process that will receive and forward all messages during the
   * mount process to the mount invoker. The current process will wait for
   * this new process (in HandleMBUnmount()) and the daemon to be connected with
   * a socket and only then call SFSMountFS() that does the real mount work.
   */
  strcpy(&mountCmd[2], sgNameP);               // "M /dev/gpfs1"
  mountHelperID = fork_mount_helper(mountCmd);

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_HELPERRC,
         "gpfs_read_super: mount helper mountHelperID %d\n", mountHelperID);

  /* Drop the BKL (if held) around the mount call and retake it after */
  if (kernel_locked())
  {
    unlock_kernel();
    kernel_unlock = 1;
  }
  rc = gpfs_ops.gpfsMount((void *)sbP, PAGE_SIZE, sgNameP, mountpointP,
                 optionsP,
                 (struct gpfsVfsData_t **)&(sbP->u.generic_sbp),
                 &cnRootP,      /* returned root cxiNode_t */
                 &rootINum,     /* returned root inode number */
                 NULL,          /* not a soft mount */
                 mountHelperID  /* mount helper id */,
                 -1U,           /* no unique mount ID specified */
                 (sbP->s_flags & MS_RDONLY), /* is it readonly */
                 true);   /* allocate pinned memory */

  if (kernel_unlock)
    lock_kernel();

  if (rc)
  {
    code = 7;
    goto xerror;
  }

  privVfsP = (struct gpfsVfsData_t *)sbP->u.generic_sbp;
  DBGASSERT(cnRootP != NULL);
  rootIP = (struct inode *)cnRootP->osNodeP;

  DBGASSERT(rootIP != NULL);
  DBGASSERT(rootIP->u.generic_ip == cnRootP);
  DBGASSERT(cnRootP->osNodeP == rootIP);

  /* Successful mount in daemon.  Allocate root directory cache entry */
  rootDP = d_alloc_root(rootIP);
  if (!rootDP)
  {
    /* Undo the daemon side mount; xerror releases rootIP */
    rc = gpfs_ops.gpfsUnmount(privVfsP, true);
    if (rc == 0 || rc == ENOSYS)
      gpfs_ops.gpfsFinishUnmount(privVfsP);

    code = 8;
    goto xerror;
  }

  rootDP->d_op = &gpfs_dops_valid;
  sbP->s_root = rootDP;

  sbP->s_dirt = 1;            /* keep it on for sync to work */

  if (myBufP != NULL)
    cxiFreePinned(myBufP);

  unlock_super(sbP);

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_SUCCEED,
         "gpfs_read_super exit: success sbP 0x%lX\n", sbP);
  return sbP;

xerror:
  if (rootDP)
    dput(rootDP);
  if (rootIP)
    iput(rootIP);

  if (myBufP != NULL)
    cxiFreePinned(myBufP);

  unlock_super(sbP);

  sbP->s_dev = 0;
  if (MOD_IN_USE)
    MOD_DEC_USE_COUNT;

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_READSUPER_FAILED,
         "gpfs_read_super: failed code %d rc %d\n", code, rc);
  return NULL;
}

/* Register the GPFS file system type with the Linux VFS.
 * Returns whatever register_filesystem() hands back.
 */
int
gpfs_reg_fs()
{
  int regRc = register_filesystem(&gpfs_fs_type);

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_REGFS,
         "gpfs_reg_fs rc %d\n", regRc);

  return regRc;
}

/* Remove the GPFS file system type from the Linux VFS.  The result of
 * unregister_filesystem() is only traced, not returned.
 */
void
gpfs_unreg_fs()
{
  int unregRc = unregister_filesystem(&gpfs_fs_type);

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_UNREGFS,
         "gpfs_unreg_fs rc %d\n", unregRc);
}

/* Release method for the file created on behalf of the daemon or the
 * mount helper.  The inode's u.generic_ip holds the pid that was saved
 * when the inode was set up (see get_myinode), so the exiting program is
 * recognized either by the current pid or by that saved pid.
 *
 * When the daemon is the one going away, mmfsd_id is reset and anyone
 * sleeping on pwq (kill_mmfsd) is woken.  When the mount helper is the
 * one going away, mount_id is reset.
 *
 * Always returns 0.
 */
int 
mmfsd_release(struct inode *inode, struct file *filp)
{
  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MMFSD_RELEASE_1,
         "mmfsd_release: prog has exited inode %lX filp %lX pid %X "
         "inode_pid %lX\n", inode, filp, current->pid, inode->u.generic_ip);

  if (mmfsd_id == current->pid || mmfsd_id == PTR_TO_INT32(inode->u.generic_ip))
  {
    TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_MMFSD_RELEASE_2,
           "mmfsd_release: mmfsd exited pid %X\n", mmfsd_id);
    mmfsd_id = -1;
    wake_up(&pwq);
  }
  /* Compare mount_id (not mmfsd_id) against the pid saved in the inode;
   * the daemon id was already handled, and possibly reset, just above.
   */
  if (mount_id == current->pid || mount_id == PTR_TO_INT32(inode->u.generic_ip))
  {
    TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_MMFSD_RELEASE_3,
           "mmfsd_release: mount exited pid %X\n", mount_id);
    mount_id = -1;
  }
  if (MOD_IN_USE)
    MOD_DEC_USE_COUNT;   // ss_release is called too late to do the decrement

  return 0;
}

/* Ask the daemon to terminate.  Sends signal 15 to the recorded daemon
 * pid and, if the pid is still recorded afterwards, sleeps on pwq until
 * mmfsd_release() issues the wake up.  Does nothing when no daemon pid
 * is recorded.
 */
void
kill_mmfsd(void)
{
  if (mmfsd_id == -1)
    return;

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_KILLMMFSD,
        "kill_mmfsd: pid %X\n", mmfsd_id);

  kill(mmfsd_id, 15 /*SIGTERM*/);

  /* The release may already have happened while signalling */
  if (mmfsd_id != -1)
    sleep_on(&pwq);
}

/* Bookkeeping for the single placeholder inode that is parked on the
 * unused inode list (see cxiAddOSNodeUnused / swapd_s_clear_inode).
 */
/* Inode number given to the placeholder; bumped each time it is cleared */
static unsigned int unusedInodeNum = 1;
/* The placeholder inode itself, or NULL when none is outstanding */
static struct inode *unusedInodeP = NULL;
/* Pinned dummy superblock the placeholder inode belongs to, or NULL */
static struct super_block *unusedSuperP = NULL;

/* Clear-inode handler for the placeholder inode.
 *
 * kswapd calls this during prune_icache, which imposes special blocking
 * rules: do not block waiting for memory, and do not block on a mutex
 * whose holder may itself be waiting for memory.
 *
 * Frees the dummy superblock, forgets the placeholder, and signals the
 * gpfs swapd.
 */
void 
swapd_s_clear_inode(struct inode *iP)
{
  struct super_block *dummySbP;

  /* special non blocking trace record */
  TRACE5N(TRACE_VNODE, 1, TRCID_SWAPD_CLEARINODE,
          "swapd_s_clear_inode enter: iP 0x%lX ino %d unusedInode 0x%lX "
          "unusedSuper 0x%lX unusedInodeNum %d\n", iP, iP->i_ino, unusedInodeP,
          unusedSuperP, unusedInodeNum);

  DBGASSERT(iP == unusedInodeP);
  DBGASSERT(iP->i_sb == unusedSuperP);
  DBGASSERT(iP->i_ino == unusedInodeNum);

  dummySbP = iP->i_sb;
  cxiFreePinned(dummySbP);

  /* The counter doubles as a tally of how often we have been released */
  unusedInodeNum++;
  unusedSuperP = NULL;
  unusedInodeP = NULL;

  iP->i_nlink = 0;
  iP->i_sb = NULL;

  /* Notify gpfs swapd */
  gpfs_ops.gpfsSwapdSignal(false);
}

/* Read-inode handler for the placeholder inode.  Nothing is filled in;
 * the call is only traced together with the current bookkeeping state.
 */
void
swapd_s_read_inode(struct inode *iP)
{
  TRACE5(TRACE_VNODE, 1, TRCID_SWAPD_READINODE,
         "swapd_s_read_inode enter: iP 0x%lX ino %d unusedInode 0x%lX "
         "unusedSuper 0x%lX unusedInodeNum %d\n",
         iP, iP->i_ino,
         unusedInodeP, unusedSuperP, unusedInodeNum);
}

void
cxiRemoveOSNodeUnused()
{
  struct inode *iP = unusedInodeP;
  struct super_block *sbP = unusedSuperP;

  if (iP)
  {
    iP = igrab(iP);
    if (iP)
    {
      DBGASSERT(iP == unusedInodeP);
      DBGASSERT(iP->i_sb == sbP);
      DBGASSERT(iP->i_ino == unusedInodeNum);
 
      iP->i_nlink = 0;
      iput(iP);
    }
  }
  return;
}
    
/* Park a placeholder inode on the unused inode list
 * (i_nlink == 1 && i_count == 0).  It gets reclaimed if kswapd runs, and
 * the resulting call to swapd_s_clear_inode makes our swapd run.
 *
 * Does nothing when a placeholder already exists.
 * Returns 0 on success or ENOMEM when the dummy superblock or the inode
 * cannot be obtained.
 */
int
cxiAddOSNodeUnused()
{
  struct inode *inodeP = unusedInodeP;
  struct super_block *superP = unusedSuperP;
  int rc = 0;

  if (inodeP == NULL)
  {
    superP = cxiMallocPinned(sizeof(struct super_block));
    unusedSuperP = superP;

    if (superP == NULL)
      rc = ENOMEM;
    else
    {
      memset(superP, 0, sizeof(struct super_block));
      superP->s_op = &swapd_sops;
#if LINUX_KERNEL_VERSION >= 2041800
      superP->s_flags |= MS_ACTIVE;
#endif

      inodeP = iget(superP, unusedInodeNum);
      unusedInodeP = inodeP;

      if (inodeP == NULL)
      {
        cxiFreePinned(superP);
        unusedSuperP = NULL;
        rc = ENOMEM;
      }
      else
      {
        /* With i_nlink at 1 the inode stays on the unused inode list
         * once released, where kswapd may reclaim it.
         */
        iput(inodeP);
      }
    }
  }

  TRACE4(TRACE_VNODE, 1, TRCID_ADDOSNODE_UNUSED,
         "cxiAddOSNodeUnused: iP 0x%lX sbP 0x%lX ino %d rc %d\n",
         inodeP, superP, unusedInodeNum, rc);
  return rc;
}


struct inode *
get_myinode(void)
{
  struct inode *inode = get_empty_inode();

  if (!inode)
    return 0;
  inode->i_op = &daemon_inodeops;
  inode->i_fop = &daemon_fops;
  inode->i_state = I_DIRTY;
  inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
  inode->i_uid = current->fsuid;
  inode->i_gid = current->fsgid;
  inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  inode->i_blksize = PAGE_SIZE;
  inode->u.generic_ip = INT32_TO_PTR(current->pid);  // save proc id
  return inode;
}

/* Body of the kernel thread started by fork_mount_helper().  Sets up a
 * file backed by a get_myinode() inode and then execs the program named
 * by mmfs_path with mount_opt as its single argument.
 *
 *   nothing - unused thread argument
 *
 * Returns -1 on any failure, including execve() coming back; "code"
 * records which setup step failed and is traced on exit.
 */
int
exec_mmfs(void *nothing)
{
  int rc = 0;
  int code = 0;
  static char *argv[] = { mmfs_path, mount_opt, NULL };
  static char *envp[] = { "HOME=/", NULL };
  int i,j,k;   /* only referenced by the disabled (#if 0) code below */
  struct file *file;
  struct inode *inode;

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_EXECMMFS_1,
         "exec_mmfs: mmfs_path %s mount_opt %s\n",
         mmfs_path, mount_opt);

  /* presumably so that execve() below accepts the kernel resident
   * path/argv/envp -- TODO confirm
   */
  set_fs(KERNEL_DS);

#if 0
  close(0); close(1); close(2); // Just used to reserve the space
#endif

  file = get_empty_filp();
  if (!file)
  {
    code = 1;
    rc = -1;
    goto xerror;
  }
  inode = get_myinode();
  if (!inode)
  {
    /* NOTE(review): "file" is not released on this path -- confirm */
    code = 2;
    rc = -1;
    goto xerror;
  }

#if 0
  i = get_unused_fd();
  j = get_unused_fd();
  k = get_unused_fd();
  if (i!=0 || j != 1 || k != 1)
  {
    code = 3;
    rc = -1;
    goto xerror;
  }
#endif

  file->f_dentry = dget(d_alloc_root(inode));
  if (!file->f_dentry)
  {
    /* NOTE(review): neither "file" nor "inode" is released on this
     * path -- confirm
     */
    code = 4;
    rc = -1;
    goto xerror;
  }
  file->f_pos = 0;
  file->f_flags = O_RDWR;
  file->f_op = &daemon_fops;
  file->f_mode = 3;

  /* NOTE(review): with the fd_install() calls disabled the file is never
   * attached to a descriptor in this function -- verify how it is
   * eventually released.
   */
#if 0
  fd_install(i,file);
  fd_install(j,file);
  fd_install(k,file);
#endif

  execve(mmfs_path, argv, envp);
  /* Only reached if the exec did not take effect */
  rc = -1;

xerror:
  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_EXECMMFS_EXIT,
         "exec_mmfs: exit rc %d code %d errno %d path %s\n",
         rc, code, errno, mmfs_path);
  return rc;
}

/* Start the mount helper: record the helper's argument in mount_opt and
 * spawn a kernel thread running exec_mmfs().
 *
 *   data - argument string handed to the helper (copied to mount_opt)
 *
 * Returns the value kernel_thread() produced, which is also stored in
 * mount_id.
 */
int
fork_mount_helper(char *data)
{
  /* exec_mmfs picks its argument up from mount_opt */
  strcpy(mount_opt, data);

  mount_id = kernel_thread(exec_mmfs, NULL, 0);

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_FORK_MOUNTHELPER,
        "fork_mount_helper: new pid %X\n", mount_id);

  return mount_id;
}

/* Set device id and other information for a file system being mounted.
 *
 *   osVfsP      - Linux superblock of the file system
 *   sgDevID     - device id (not used here)
 *   bsize       - block size to record in the superblock
 *   osRootNodeP - root directory inode, may be NULL
 *   cnRootP     - cxiNode of the root directory
 *   releRootP   - (out) set to true when the caller must release its
 *                 hold on the root, false otherwise
 *   gnRootP     - compared against cnRootP to detect a reconnect
 *
 * Always returns 0.
 */
int
cxiSetMountInfo(void *osVfsP, cxiDev_t sgDevID,
                int bsize, void *osRootNodeP, cxiNode_t *cnRootP,
                Boolean *releRootP, void *gnRootP)/*(out)maintain hold on root*/
{
  struct super_block *sbP = (struct super_block *)osVfsP;
  struct inode *rootIP = (struct inode *)osRootNodeP; // root dir inode
  int i;

  TRACE4(TRACE_VNODE, 1, TRCID_SET_MOUNT_INFO,
         "cxiSetMountInfo: sbP 0x%lX rootIP 0x%lX cnRootP 0x%lX gnRootP 0x%lX\n",
         sbP, rootIP, cnRootP, gnRootP);
  DBGASSERT(sbP != NULL);

  /* This is the auto remount case where mmfsd died/killed and restarted. */
  if (gnRootP == cnRootP)
  {
    /* Since the OS independent layer looked up and held the
     * root vnode, we've got too many hold counts for a reconnect.
     * Tell upper layer that we must release.
     */
    *releRootP = true;
  }
  else
  {
    /* Don't attempt to release the root VFS node */
    *releRootP = false;
    sbP->s_blocksize = bsize;

    /* s_blocksize_bits = floor(log2(blocksize)).  Looping while i > 1
     * (rather than i != 1) gives the same answer for every positive
     * size and terminates for a zero or negative one instead of
     * spinning forever.
     */
    for (i = sbP->s_blocksize, sbP->s_blocksize_bits = 0; i > 1; i >>= 1)
      sbP->s_blocksize_bits++;
  }
  if (rootIP != NULL)
  {
    DBGASSERT(rootIP->i_ino == INODENUM_ROOTDIR_FILE);
    DBGASSERT(rootIP->u.generic_ip == cnRootP);
  }

  return 0;
}

/*
 * Refer to vfs_unmount in the file system operations section of the manual
 * "Calls and Subroutines Reference: kernel"
 *
 * Purge the cached nodes, unmount the file system and, for a regular
 * (non-forced) unmount that succeeded, finish the unmount and detach the
 * private VFS data from the superblock.
 *
 *   sbP   - superblock being unmounted
 *   force - true for a forced unmount
 *
 * Returns 0 on success, otherwise the gpfsUnmount error (ENOSYS is
 * treated as success).
 */
int
unmountInternal(struct super_block *sbP, Boolean force)
{
  struct gpfsVfsData_t *vfsDataP;
  int rc = 0;

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_UNMOUNTINT_ENTER,
         "unmountInternal enter: sbP 0x%lX force %d\n", sbP, force);

  vfsDataP = (struct gpfsVfsData_t *)sbP->u.generic_sbp;

  /* Purge cached OS VFS nodes/cxiNodes. */
  gpfs_ops.gpfsUncache(vfsDataP);

  if (vfsDataP != NULL)
  {
    rc = gpfs_ops.gpfsUnmount(vfsDataP, force);
    if (rc == ENOSYS)
      rc = 0;

    /* A forced unmount is expected to be followed by a regular unmount,
     * and it is that one which finishes the job.
     */
    if (!force && rc == 0)
    {
      gpfs_ops.gpfsFinishUnmount(vfsDataP);
      sbP->u.generic_sbp = NULL;
    }
  }

  sbP->s_dirt = 0;

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_UNMOUNTINT_EXIT,
         "unmountInternal exit: rc %d\n", rc);
  return rc;
}

/* Placeholder for an operation that does not exist on Linux: traces its
 * arguments, raises a log assert and returns 0.
 */
int
cxiReactivateOSNode(void *osVfsP, cxiNode_t *cnP, void **osNodePP)
{
  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_REACTIVATE_VNODE,
         "cxiReactivateOSNode: sbP 0x%lX cxiNodeP 0x%lX osNodePP 0x%lX\n",
         osVfsP,
         cnP,
         osNodePP);

  LOGASSERT(0);   // not implemented on linux

  return 0;
}


/* Obtain the Linux inode (and the cxiNode hanging off it) for an inode
 * number.
 *
 *   osVfsP   - superblock to look the inode up in
 *   cnPP     - (out) cxiNode found in the inode's u.generic_ip, or NULL
 *   osNodePP - (out) the inode, or NULL
 *   inum     - inode number
 *   nodeSize - caller's idea of the node size; must equal CXINODE_SIZE
 *   opaqueP  - passed through to iget4()
 *
 * Returns 0 on success, ENOMEM when iget4() yields nothing, EIO when it
 * yields a bad inode, and ELIBBAD (outputs untouched) on a size mismatch.
 */
int
cxiNewOSNode(void *osVfsP, cxiNode_t **cnPP, void **osNodePP,
             cxiIno_t inum, int nodeSize, void *opaqueP)
{
  struct super_block *sbP = (struct super_block *)osVfsP;
  struct inode *iP;
  int rc;

  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE,
         "cxiNewOSNode: sbP 0x%lX inum %d size %d",
         sbP, inum, nodeSize);

  if (nodeSize != CXINODE_SIZE)
  {
    /* The requested nodeSize does not match CXINODE_SIZE.
       Whoever called us is an incompatible version of the code or was
       somehow not compiled correctly. */
    TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE_BAD,
           "cxiNewOSNode: requested nodeSize %d does not match CXINODE_SIZE %d",
           nodeSize, CXINODE_SIZE);
    printk("mmfs: module inconsistency detected in cxiNewOSNode:\n"
           "      requested nodeSize %d does not match CXINODE_SIZE %d\n",
           nodeSize, CXINODE_SIZE);
    LOGASSERT(!"nodeSize != CXINODE_SIZE");
    return ELIBBAD;
  }

  iP = iget4(sbP, inum, NULL, opaqueP);
  if (iP != NULL && !is_bad_inode(iP))
  {
    DBGASSERT(iP->u.generic_ip != NULL);
    *cnPP = (cxiNode_t *)iP->u.generic_ip;
    *osNodePP = iP;
    rc = 0;
  }
  else
  {
    /* Either nothing came back or the inode is unusable */
    *cnPP = NULL;
    *osNodePP = NULL;
    if (iP == NULL)
      rc = ENOMEM;
    else
    {
      iput(iP);
      rc = EIO;
    }
  }

  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_NEW_VNODE_EXIT,
         "cxiNewOSNode: exit osNodeP 0x%lX cnP 0x%lX rc %d\n",
         *osNodePP, *cnPP, rc);
  return rc;
}


/* Detach a cxiNode from its Linux inode.
 *
 * The linux kernel decrements the inode count and deallocates the inode
 * after gpfs_s_put_inode() is called, so no delete is performed here.
 * The inode (if any) is merely stripped: private pointer, operation
 * tables, size and link count are reset and its address space operations
 * are switched to the "after inode delete" set.
 */
void
cxiFreeOSNode(void *osVfsP, struct cxiNode_t *cnP, void *osNodeP)
{
  struct inode *iP = (struct inode *)osNodeP;
  struct super_block *sbP = (struct super_block *)osVfsP;

  TRACE5(TRACE_VNODE, 2, TRCID_LINUXOPS_DELETE_VNODE,
         "cxiFreeOSNode enter: sbP 0x%lX cxiNodeP 0x%lX "
         "iP 0x%lX inode %d i_count %d\n",
         sbP, cnP, iP,
         iP ? iP->i_ino : -1,
         iP ? atomic_read((atomic_t *)&iP->i_count) : 0);

  DBGASSERT(cnP->osNodeP == iP);
  cnP->osNodeP = NULL;

  if (iP == NULL)
    return;

  DBGASSERT(atomic_read((atomic_t *)&iP->i_count) == 0);

  iP->u.generic_ip = NULL;
  iP->i_fop = NULL;
  iP->i_op = NULL;
  if (iP->i_mapping != NULL)
    iP->i_mapping->a_ops = &gpfs_aops_after_inode_delete;
  iP->i_nlink = 0;
  iP->i_size = 0;
}

/* Nothing to do on Linux beyond tracing the segment id. */
void
cxiDeleteMmap(cxiVmid_t segid)
{
  TRACE1(TRACE_VNODE, 2, TRCID_LINUXOPS_DELETE_MMAP,
         "cxiDeleteMmap: segid 0x%X\n",
         segid);
}

/* Placeholder for an operation that does not exist on Linux: traces its
 * arguments and raises a log assert.
 */
void
cxiReinitOSNode(void *osVfsP, struct cxiNode_t *cnP, void *osNodeP)
{
  struct inode *iP = (struct inode *)osNodeP;
  struct super_block *sbP = (struct super_block *)osVfsP;

  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_REINIT_VNODE,
         "cxiReinitOSNode: sbP 0x%lX cnP 0x%lX iP 0x%lX\n",
         sbP,
         cnP,
         iP);

  LOGASSERT(0);   // not implemented on linux
}

/* Hand back the Linux inode (as a void pointer) that belongs to a
 * cxiNode_t, optionally taking a reference on it.
 *
 *   osVfsP   - superblock (traced only)
 *   cnP      - node whose osNodeP is wanted
 *   osNodePP - (out) the inode pointer, set even when the hold fails
 *   holdit   - in - add reference to node
 *
 * Returns 0, or ENOENT when a reference was requested but igrab()
 * refused it.
 */
int
cxiFindOSNode(void *osVfsP, cxiNode_t *cnP, void **osNodePP,
              Boolean holdit)  /* in - add reference to node */
{
  struct inode *iP = (struct inode *)cnP->osNodeP;
  struct super_block *sbP = (struct super_block *)osVfsP;
  int rc = 0;

  *osNodePP = iP;

  if (holdit)
  {
    DBGASSERT(iP != NULL);

    /* Potentially someone could be attempting to free this inode
     * and the igrab() would fail.  If a free were occurring another
     * thread would call gpfs_s_clear_inode() and may in fact be
     * waiting on the gpfsNode cache lock we hold.
     */
    if (igrab(iP) == NULL)
      rc = ENOENT;
  }

  TRACE6(TRACE_VNODE, 2, TRCID_LINUXOPS_FIND_VNODE,
         "cxiFindOSNode: sbP 0x%lX cxiNodeP 0x%lX "
         "iP 0x%lX i_count %d holdit %d rc %d\n", sbP, cnP,
         *osNodePP, iP ? atomic_read((atomic_t *)&iP->i_count) : 0, holdit, rc);

  return rc;
}

/* Debug aid: print the inode attached to a cxiNode followed by every
 * dentry on the inode's alias list.  The alias list is walked with
 * dcache_lock held.
 */
void
cxiDumpOSNode(cxiNode_t *cnP)
{
  struct inode *iP = (struct inode *)cnP->osNodeP;
  struct list_head *headP;
  struct list_head *curP;

  TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_DUMP_VNODE,
         "cxiDumpOSNode: cxiNodeP 0x%lX iP 0x%lX\n", cnP, iP);

  if (iP == NULL)
    return;

  printInode(iP);

  headP = &iP->i_dentry;
  spin_lock(&dcache_lock);
  curP = headP->next;
  while (curP != headP)
  {
    printDentry(list_entry(curP, struct dentry, d_alias));
    curP = curP->next;
  }
  spin_unlock(&dcache_lock);
}

/* Take one additional reference on an inode.
 *
 * On linux the i_count cannot simply be decremented, so only a positive
 * increment (exactly 1) is accepted here.  To give a reference back use
 * cxiPutOSNode(), which goes through the VFS layer.
 *
 * Returns the inode's hold count after the grab, or 0 when igrab()
 * refused.
 */
int
cxiRefOSNode(void *osVfsP, cxiNode_t *cnP, void *osNodeP, int inc)
{
  struct inode *iP = (struct inode *)osNodeP;
  struct inode *grabbedP;
  int holdCount = 0;
  int ino = -1;

  DBGASSERT(iP != NULL);
  DBGASSERT(inc == 1);

  /* The igrab() may fail if this inode is actively going
   * thru a release.
   */
  grabbedP = igrab(iP);
  if (grabbedP != NULL)
  {
    ino = grabbedP->i_ino;
    holdCount = atomic_read((atomic_t *)&grabbedP->i_count);
  }

  TRACE5(TRACE_VNODE, 2, TRCID_LINUXOPS_REF_VNODE,
         "cxiRefOSNode exit: sbP 0x%lX cxiNodeP 0x%lX iP 0x%lX inode %d i_count to %d",
         osVfsP, cnP, iP, ino, holdCount);
  return holdCount;
}
  
/* Report how many references an OS node currently has.
 *
 *   canCacheP      - (out) always set to false
 *   hasReferencesP - (out) true when the inode's i_count is above zero
 *
 * Returns the i_count that was read.
 */
int 
cxiInactiveOSNode(void *osVfsP, struct cxiNode_t *cnP, void *osNodeP, 
                 Boolean *canCacheP, Boolean *hasReferencesP)
{
  struct super_block *sbP = (struct super_block *)osVfsP;
  struct inode *iP = (struct inode *)osNodeP;
  int holdCount;

  DBGASSERT(cnP->osNodeP == iP);

  holdCount = atomic_read((atomic_t *)&iP->i_count);

  *canCacheP = false;
  *hasReferencesP = (holdCount > 0) ? true : false;

  TRACE6(TRACE_VNODE, 2, TRCID_LINUXOPS_INACTIVE_VNODE,
         "cxiInactiveOSNode: sbP 0x%lX cxiNodeP 0x%lX iP 0x%lX "
         "i_count %d canCache %d hasReferences %d\n", sbP, cnP, iP, 
         holdCount, *canCacheP, *hasReferencesP);

  return holdCount;
}

/* Give back one reference on an inode via iput().  The caller must hold
 * at least one reference.
 */
void
cxiPutOSNode(void *vP)
{
  struct inode *iP = (struct inode *)vP;
  int countBefore;

  DBGASSERT(iP != NULL);
  countBefore = atomic_read((atomic_t *)&iP->i_count);
  DBGASSERT(countBefore > 0);

  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_PUT_OSNODE,
         "cxiPutOSNode enter: iP 0x%lX inode %d i_count to %d\n",
         iP, iP->i_ino, countBefore-1);

  iput(iP);
}

/* Mark an inode for destruction by zeroing its link count.  No reference
 * is dropped here; the caller must hold at least one.
 */
void
cxiDestroyOSNode(void *vP)
{
  struct inode *iP = (struct inode *)vP;
  int curCount;

  DBGASSERT(iP != NULL);
  curCount = atomic_read((atomic_t *)&iP->i_count);
  DBGASSERT(curCount > 0);

  TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_DESTROY_OSNODE,
         "cxiDestroyOSNode enter: iP 0x%lX inode %d i_count %d i_nlink %d\n",
         iP, iP->i_ino, curCount, iP->i_nlink);

  iP->i_nlink = 0;
}

/* Translate the file type bits of a mode into the cxiNode's nType.
 * An unrecognized file type trips a debug assert and leaves nType
 * untouched.  The dev argument is not used.
 */
void
cxiSetOSNodeType(struct cxiNode_t *cnP, cxiMode_t mode, cxiDev_t dev)
{
  if (S_ISDIR(mode))
  {
    cnP->nType = cxiVDIR;
    return;
  }
  if (S_ISREG(mode))
  {
    cnP->nType = cxiVREG;
    return;
  }
  if (S_ISLNK(mode))
  {
    cnP->nType = cxiVLNK;
    return;
  }
  if (S_ISCHR(mode))
  {
    cnP->nType = cxiVCHR;
    return;
  }
  if (S_ISBLK(mode))
  {
    cnP->nType = cxiVBLK;
    return;
  }
  if (S_ISFIFO(mode))
  {
    cnP->nType = cxiVFIFO;
    return;
  }
  if (S_ISSOCK(mode))
  {
    cnP->nType = cxiVSOCK;
    return;
  }

  /* none of the known file types */
  DBGASSERT(0);
}

/* Spin lock taken by cxiUpdateInode() around the compare-and-grow update
 * of i_size/i_blocks (CXIUP_SIZE_BIG).
 * NOTE(review): this is a definition local to this module; it shares its
 * name with a lock inside the kernel proper -- confirm no clash.
 */
spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;

/* Copy selected attributes from attrP into the Linux inode attached to
 * cnP.
 *
 *   cnP   - node whose inode (cnP->osNodeP) is updated
 *   attrP - source attributes
 *   what  - bit mask of CXIUP_* flags selecting what to copy
 *
 * CXIUP_ATIME is handled on its own: only i_atime is updated and the
 * function returns immediately.  CXIUP_SIZE_BIG only ever grows i_size.
 * CXIUP_PERM additionally refreshes the cached permission info in cnP
 * and picks the inode operations matching the extended permission flag.
 * Nothing happens when no inode is attached.
 */
void
cxiUpdateInode(cxiNode_t *cnP, cxiVattr_t *attrP, int what)
{
  struct inode *iP = (struct inode *)cnP->osNodeP;

  /* Without an inode there is nothing to update, and the traces at the
   * end of this function dereference iP.
   */
  if (iP == NULL)
    return;

  if (what & CXIUP_ATIME)
  {
    iP->i_atime = attrP->va_atime.tv_sec;
    return;
  }
  if (what & CXIUP_MODE)
  {
    iP->i_mode = attrP->va_mode;
    iP->i_ctime = attrP->va_ctime.tv_sec;
  }
  if (what & CXIUP_OWN)
  {
    iP->i_mode = attrP->va_mode;
    iP->i_uid  = attrP->va_uid;
    iP->i_gid  = attrP->va_gid;
    iP->i_ctime = attrP->va_ctime.tv_sec;
  }
  if (what & CXIUP_NLINK)
  {
    iP->i_nlink = attrP->va_nlink;
  }
  if (what & CXIUP_SIZE)
  {
    iP->i_size = attrP->va_size;
    iP->i_blocks = attrP->va_blocks;
  }
  if (what & CXIUP_SIZE_BIG)
  {
    /* Only move the size forward; the check and the update are done
     * under inode_lock.
     */
    spin_lock(&inode_lock);
    if (attrP->va_size > iP->i_size)
    {
      iP->i_size = attrP->va_size;
      iP->i_blocks = attrP->va_blocks;
    }
    spin_unlock(&inode_lock);
  }
  if (what & CXIUP_TIMES)
  {
    iP->i_atime = attrP->va_atime.tv_sec;
    iP->i_mtime = attrP->va_mtime.tv_sec;
    iP->i_ctime = attrP->va_ctime.tv_sec;
  }
  if (what & CXIUP_PERM)
  {
    iP->i_mode = attrP->va_mode;
    iP->i_uid  = attrP->va_uid;
    iP->i_gid  = attrP->va_gid;
    cnP->xinfo = attrP->va_xinfo;
    cnP->icValid |= CXI_IC_PERM;
    if (S_ISREG(iP->i_mode) || S_ISLNK(iP->i_mode))
      iP->i_op = (attrP->va_xinfo & VA_XPERM) ? 
        &gpfs_iops_xperm: &gpfs_iops_stdperm;
    else if (S_ISDIR(iP->i_mode))
      iP->i_op = (attrP->va_xinfo & VA_XPERM) ? 
        &gpfs_dir_iops_xperm: &gpfs_dir_iops_stdperm;
  }
  if ((what & CXIUP_NLINK) && cnP->destroyIfDelInode)
    cxiDropInvalidDCacheEntries(cnP);

  TRACE4(TRACE_VNODE, 3, TRCID_CXIUPDATE_INODE_2,
     "cxiUpdateInode: iP 0x%X atime 0x%X mtime 0x%X ctime 0x%X\n",
      iP, iP->i_atime, iP->i_mtime, iP->i_ctime);
  TRACE7(TRACE_VNODE, 3, TRCID_CXIUPDATE_INODE_1,
     "cxiUpdateInode: what %d mode 0x%X uid %d gid %d nlink %d size %lld"
     " blocks %d\n",
     what, iP->i_mode, iP->i_uid, iP->i_gid, iP->i_nlink,
     iP->i_size, iP->i_blocks);
}

/* Decide whether the OS node of a cxiNode can be uncached with respect
 * to a particular VFS.
 *
 *   osVfsP - superblock the node is expected to belong to
 *   cnP    - node to examine
 *   vP     - (out) the inode when one exists and belongs to osVfsP,
 *            otherwise NULL
 *
 * Returns true when no reference count was found: either the inode's
 * i_count is zero, or there is no inode belonging to osVfsP.
 */
Boolean
cxiCanUncacheOSNode(void *osVfsP, struct cxiNode_t *cnP, void **vP)
{
  struct inode *iP = (struct inode *)cnP->osNodeP;
  int count = 0;

  if (iP == NULL || iP->i_sb != osVfsP)
    *vP = NULL;
  else
  {
    *vP = (void *)iP;
    count = atomic_read((atomic_t *)&iP->i_count);
  }

  TRACE6(TRACE_VNODE, 2, TRCID_LINUXOPS_CANUNCACHE_OSNODE,
         "cxiCanUncacheOSNode: cxiNode 0x%lx vP 0x%lX osVfsP 0x%lX "
         "i_sb 0x%lX inode %d i_count %d\n", cnP, vP, osVfsP,
         (iP ? iP->i_sb : 0), (iP ? iP->i_ino : 0), count);
  return (count == 0);
}


/* Enter an OS node into the lookup cache: mark the dentry valid, bind it
 * to the inode and make sure it is on the dcache hash list.
 *
 * Called with the necessary distributed lock held, which guarantees that
 * the lookup cache entry is valid.
 */
void cxiAddOSNode(void *dentryP, void *vP)
{
  struct dentry *dP = (struct dentry *)dentryP;
  struct inode *iP = (struct inode *)vP;

  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_ADD_OSNODE,
         "cxiAddOSNode: dentry 0x%lx vP 0x%lX unhashed %d",
         dentryP, vP, d_unhashed(dP));

  /* valid dentry operations first, then tie dentry and inode together */
  dP->d_op = &gpfs_dops_valid;
  d_instantiate(dP, iP);

  /* hash the dentry unless that already happened */
  if (d_unhashed(dP))
  {
    d_rehash(dP);
  }
}
